# Load dataset
# Load necessary libraries
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(dplyr) # %>%
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the CSV file from Google Drive
url <- "https://drive.google.com/uc?export=download&id=12CriTyCML_9rGAySnmcitxXbij7N11T0"
my_data <- read_csv(url)
## Rows: 920 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): sex, dataset, cp, restecg, slope, thal
## dbl (8): id, age, trestbps, chol, thalch, oldpeak, ca, num
## lgl (2): fbs, exang
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View the first few rows
head(my_data)
#Removing Unnecessary Columns
my_data <- my_data %>%
select(-id, -dataset)
print(my_data)
## # A tibble: 920 × 14
## age sex cp trestbps chol fbs restecg thalch exang oldpeak slope
## <dbl> <chr> <chr> <dbl> <dbl> <lgl> <chr> <dbl> <lgl> <dbl> <chr>
## 1 63 Male typical… 145 233 TRUE lv hyp… 150 FALSE 2.3 down…
## 2 67 Male asympto… 160 286 FALSE lv hyp… 108 TRUE 1.5 flat
## 3 67 Male asympto… 120 229 FALSE lv hyp… 129 TRUE 2.6 flat
## 4 37 Male non-ang… 130 250 FALSE normal 187 FALSE 3.5 down…
## 5 41 Female atypica… 130 204 FALSE lv hyp… 172 FALSE 1.4 upsl…
## 6 56 Male atypica… 120 236 FALSE normal 178 FALSE 0.8 upsl…
## 7 62 Female asympto… 140 268 FALSE lv hyp… 160 FALSE 3.6 down…
## 8 57 Female asympto… 120 354 FALSE normal 163 TRUE 0.6 upsl…
## 9 63 Male asympto… 130 254 FALSE lv hyp… 147 FALSE 1.4 flat
## 10 53 Male asympto… 140 203 TRUE lv hyp… 155 TRUE 3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
#Checking for Missing Data
my_data[!complete.cases(my_data),]
#Handling Missing Values
my_data[is.na(my_data$trestbps),]
# Calculate the median
trestbps_medv <- median(my_data$trestbps, na.rm = TRUE)
print(trestbps_medv)
## [1] 130
# Replace NA values with the median
my_data$trestbps[is.na(my_data$trestbps)] <- trestbps_medv
# Print updated data
print(my_data)
## # A tibble: 920 × 14
## age sex cp trestbps chol fbs restecg thalch exang oldpeak slope
## <dbl> <chr> <chr> <dbl> <dbl> <lgl> <chr> <dbl> <lgl> <dbl> <chr>
## 1 63 Male typical… 145 233 TRUE lv hyp… 150 FALSE 2.3 down…
## 2 67 Male asympto… 160 286 FALSE lv hyp… 108 TRUE 1.5 flat
## 3 67 Male asympto… 120 229 FALSE lv hyp… 129 TRUE 2.6 flat
## 4 37 Male non-ang… 130 250 FALSE normal 187 FALSE 3.5 down…
## 5 41 Female atypica… 130 204 FALSE lv hyp… 172 FALSE 1.4 upsl…
## 6 56 Male atypica… 120 236 FALSE normal 178 FALSE 0.8 upsl…
## 7 62 Female asympto… 140 268 FALSE lv hyp… 160 FALSE 3.6 down…
## 8 57 Female asympto… 120 354 FALSE normal 163 TRUE 0.6 upsl…
## 9 63 Male asympto… 130 254 FALSE lv hyp… 147 FALSE 1.4 flat
## 10 53 Male asympto… 140 203 TRUE lv hyp… 155 TRUE 3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$trestbps),]
# Handling Missing Values
my_data[is.na(my_data$chol),]
chol_meanv<- mean(my_data$chol, na.rm=TRUE)
print(chol_meanv)
## [1] 199.1303
my_data$chol[is.na(my_data$chol)] <- chol_meanv
print(my_data)
## # A tibble: 920 × 14
## age sex cp trestbps chol fbs restecg thalch exang oldpeak slope
## <dbl> <chr> <chr> <dbl> <dbl> <lgl> <chr> <dbl> <lgl> <dbl> <chr>
## 1 63 Male typical… 145 233 TRUE lv hyp… 150 FALSE 2.3 down…
## 2 67 Male asympto… 160 286 FALSE lv hyp… 108 TRUE 1.5 flat
## 3 67 Male asympto… 120 229 FALSE lv hyp… 129 TRUE 2.6 flat
## 4 37 Male non-ang… 130 250 FALSE normal 187 FALSE 3.5 down…
## 5 41 Female atypica… 130 204 FALSE lv hyp… 172 FALSE 1.4 upsl…
## 6 56 Male atypica… 120 236 FALSE normal 178 FALSE 0.8 upsl…
## 7 62 Female asympto… 140 268 FALSE lv hyp… 160 FALSE 3.6 down…
## 8 57 Female asympto… 120 354 FALSE normal 163 TRUE 0.6 upsl…
## 9 63 Male asympto… 130 254 FALSE lv hyp… 147 FALSE 1.4 flat
## 10 53 Male asympto… 140 203 TRUE lv hyp… 155 TRUE 3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$chol),]
# Handling Missing Values
my_data[is.na(my_data$thalch),]
thalch_meanv<- mean(my_data$thalch, na.rm=TRUE)
print(thalch_meanv)
## [1] 137.5457
my_data$thalch[is.na(my_data$thalch)] <- thalch_meanv
print(my_data)
## # A tibble: 920 × 14
## age sex cp trestbps chol fbs restecg thalch exang oldpeak slope
## <dbl> <chr> <chr> <dbl> <dbl> <lgl> <chr> <dbl> <lgl> <dbl> <chr>
## 1 63 Male typical… 145 233 TRUE lv hyp… 150 FALSE 2.3 down…
## 2 67 Male asympto… 160 286 FALSE lv hyp… 108 TRUE 1.5 flat
## 3 67 Male asympto… 120 229 FALSE lv hyp… 129 TRUE 2.6 flat
## 4 37 Male non-ang… 130 250 FALSE normal 187 FALSE 3.5 down…
## 5 41 Female atypica… 130 204 FALSE lv hyp… 172 FALSE 1.4 upsl…
## 6 56 Male atypica… 120 236 FALSE normal 178 FALSE 0.8 upsl…
## 7 62 Female asympto… 140 268 FALSE lv hyp… 160 FALSE 3.6 down…
## 8 57 Female asympto… 120 354 FALSE normal 163 TRUE 0.6 upsl…
## 9 63 Male asympto… 130 254 FALSE lv hyp… 147 FALSE 1.4 flat
## 10 53 Male asympto… 140 203 TRUE lv hyp… 155 TRUE 3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$thalch),]
# Handling Missing Values
my_data[is.na(my_data$ca),]
# Find the most frequent value (mode) for 'ca'
mode_ca <- as.numeric(names(sort(table(my_data$ca), decreasing = TRUE))[1])
print(mode_ca)
## [1] 0
my_data$ca[is.na(my_data$ca)] <- mode_ca
print(my_data)
## # A tibble: 920 × 14
## age sex cp trestbps chol fbs restecg thalch exang oldpeak slope
## <dbl> <chr> <chr> <dbl> <dbl> <lgl> <chr> <dbl> <lgl> <dbl> <chr>
## 1 63 Male typical… 145 233 TRUE lv hyp… 150 FALSE 2.3 down…
## 2 67 Male asympto… 160 286 FALSE lv hyp… 108 TRUE 1.5 flat
## 3 67 Male asympto… 120 229 FALSE lv hyp… 129 TRUE 2.6 flat
## 4 37 Male non-ang… 130 250 FALSE normal 187 FALSE 3.5 down…
## 5 41 Female atypica… 130 204 FALSE lv hyp… 172 FALSE 1.4 upsl…
## 6 56 Male atypica… 120 236 FALSE normal 178 FALSE 0.8 upsl…
## 7 62 Female asympto… 140 268 FALSE lv hyp… 160 FALSE 3.6 down…
## 8 57 Female asympto… 120 354 FALSE normal 163 TRUE 0.6 upsl…
## 9 63 Male asympto… 130 254 FALSE lv hyp… 147 FALSE 1.4 flat
## 10 53 Male asympto… 140 203 TRUE lv hyp… 155 TRUE 3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$ca),]
# Handling Missing Values
my_data[is.na(my_data$thal),]
# Find the most frequent value (mode) for 'thal'
mode_thal <- as.character(names(sort(table(my_data$thal), decreasing = TRUE))[1])
print(mode_thal)
## [1] "normal"
my_data$thal[is.na(my_data$thal)] <- mode_thal
print(my_data)
## # A tibble: 920 × 14
## age sex cp trestbps chol fbs restecg thalch exang oldpeak slope
## <dbl> <chr> <chr> <dbl> <dbl> <lgl> <chr> <dbl> <lgl> <dbl> <chr>
## 1 63 Male typical… 145 233 TRUE lv hyp… 150 FALSE 2.3 down…
## 2 67 Male asympto… 160 286 FALSE lv hyp… 108 TRUE 1.5 flat
## 3 67 Male asympto… 120 229 FALSE lv hyp… 129 TRUE 2.6 flat
## 4 37 Male non-ang… 130 250 FALSE normal 187 FALSE 3.5 down…
## 5 41 Female atypica… 130 204 FALSE lv hyp… 172 FALSE 1.4 upsl…
## 6 56 Male atypica… 120 236 FALSE normal 178 FALSE 0.8 upsl…
## 7 62 Female asympto… 140 268 FALSE lv hyp… 160 FALSE 3.6 down…
## 8 57 Female asympto… 120 354 FALSE normal 163 TRUE 0.6 upsl…
## 9 63 Male asympto… 130 254 FALSE lv hyp… 147 FALSE 1.4 flat
## 10 53 Male asympto… 140 203 TRUE lv hyp… 155 TRUE 3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$thal),]
# Handling Missing Values
my_data[is.na(my_data$slope),]
# Find the most frequent value (mode) for 'slope'
mode_slope <- names(sort(table(my_data$slope), decreasing = TRUE))[1]
print(mode_slope)
## [1] "flat"
my_data$slope[is.na(my_data$slope)] <- mode_slope
print(my_data)
## # A tibble: 920 × 14
## age sex cp trestbps chol fbs restecg thalch exang oldpeak slope
## <dbl> <chr> <chr> <dbl> <dbl> <lgl> <chr> <dbl> <lgl> <dbl> <chr>
## 1 63 Male typical… 145 233 TRUE lv hyp… 150 FALSE 2.3 down…
## 2 67 Male asympto… 160 286 FALSE lv hyp… 108 TRUE 1.5 flat
## 3 67 Male asympto… 120 229 FALSE lv hyp… 129 TRUE 2.6 flat
## 4 37 Male non-ang… 130 250 FALSE normal 187 FALSE 3.5 down…
## 5 41 Female atypica… 130 204 FALSE lv hyp… 172 FALSE 1.4 upsl…
## 6 56 Male atypica… 120 236 FALSE normal 178 FALSE 0.8 upsl…
## 7 62 Female asympto… 140 268 FALSE lv hyp… 160 FALSE 3.6 down…
## 8 57 Female asympto… 120 354 FALSE normal 163 TRUE 0.6 upsl…
## 9 63 Male asympto… 130 254 FALSE lv hyp… 147 FALSE 1.4 flat
## 10 53 Male asympto… 140 203 TRUE lv hyp… 155 TRUE 3.1 down…
## # ℹ 910 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[is.na(my_data$slope),]
#Checking for Remaining Missing Data
my_data[!complete.cases(my_data),]
#Remove records with missing values
my_data<-my_data[complete.cases(my_data),]
print(my_data)
## # A tibble: 770 × 14
## age sex cp trestbps chol fbs restecg thalch exang oldpeak slope
## <dbl> <chr> <chr> <dbl> <dbl> <lgl> <chr> <dbl> <lgl> <dbl> <chr>
## 1 63 Male typical… 145 233 TRUE lv hyp… 150 FALSE 2.3 down…
## 2 67 Male asympto… 160 286 FALSE lv hyp… 108 TRUE 1.5 flat
## 3 67 Male asympto… 120 229 FALSE lv hyp… 129 TRUE 2.6 flat
## 4 37 Male non-ang… 130 250 FALSE normal 187 FALSE 3.5 down…
## 5 41 Female atypica… 130 204 FALSE lv hyp… 172 FALSE 1.4 upsl…
## 6 56 Male atypica… 120 236 FALSE normal 178 FALSE 0.8 upsl…
## 7 62 Female asympto… 140 268 FALSE lv hyp… 160 FALSE 3.6 down…
## 8 57 Female asympto… 120 354 FALSE normal 163 TRUE 0.6 upsl…
## 9 63 Male asympto… 130 254 FALSE lv hyp… 147 FALSE 1.4 flat
## 10 53 Male asympto… 140 203 TRUE lv hyp… 155 TRUE 3.1 down…
## # ℹ 760 more rows
## # ℹ 3 more variables: ca <dbl>, thal <chr>, num <dbl>
my_data[!complete.cases(my_data),]
# Summary of numeric variables
summary(my_data)
## age sex cp trestbps
## Min. :28.00 Length:770 Length:770 Min. : 0.0
## 1st Qu.:46.00 Class :character Class :character 1st Qu.:120.0
## Median :54.00 Mode :character Mode :character Median :130.0
## Mean :53.04 Mean :132.8
## 3rd Qu.:59.75 3rd Qu.:140.0
## Max. :77.00 Max. :200.0
## chol fbs restecg thalch
## Min. : 0.0 Mode :logical Length:770 Min. : 60.0
## 1st Qu.:198.0 FALSE:656 Class :character 1st Qu.:120.0
## Median :228.5 TRUE :114 Mode :character Median :140.0
## Mean :218.8 Mean :138.6
## 3rd Qu.:269.0 3rd Qu.:159.0
## Max. :603.0 Max. :202.0
## exang oldpeak slope ca
## Mode :logical Min. :-1.0000 Length:770 Min. :0.000
## FALSE:466 1st Qu.: 0.0000 Class :character 1st Qu.:0.000
## TRUE :304 Median : 0.5000 Mode :character Median :0.000
## Mean : 0.8874 Mean :0.261
## 3rd Qu.: 1.5000 3rd Qu.:0.000
## Max. : 6.2000 Max. :3.000
## thal num
## Length:770 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :1.0000
## Mean :0.9195
## 3rd Qu.:1.0000
## Max. :4.0000
hist(my_data$age,
main="Histogram of Age",
col="lightblue",
xlab="Age",
border="black")
hist(my_data$trestbps,
main="Histogram of trestbps ",
col="green",
xlab="trestbps",
border="black")
# Univariate Plots
hist(my_data$chol,
main="Histogram of chol",
col="yellow",
xlab="chol",
border="black")
hist(my_data$thalch,
main="Histogram of thalch ",
col="blue",
xlab="thalch",
border="black")
hist(my_data$oldpeak,
main="Histogram of oldpeak",
col="lightblue",
xlab="oldpeak",
border="black")
hist(my_data$ca,
main="Histogram of Chest pain",
col="lightblue",
xlab="Ca",
border="black")
hist(my_data$num,
main="Histogram of num",
col="lightblue",
xlab="num",
border="black")
# Convert 'restecg' to numeric
my_data$restecg <- as.numeric(factor(my_data$restecg, levels = c("normal", "lv hypertrophy", "st-t abnormality")))
# Convert 'fbs' (fasting blood sugar) to numeric
my_data$fbs <- ifelse(my_data$fbs == TRUE, 1, 0)
# Convert 'slope' to numeric
my_data$slope <- as.numeric(factor(my_data$slope, levels = c("upsloping", "flat", "downsloping")))
my_data$num <- as.factor(my_data$num)
# Update the levels of 'num' to represent meaningful categories
levels(my_data$num) <- c("No Heart Disease", "Heart Disease -Type1", "Heart Disease -Type2", "Heart Disease -Type3","Heart Disease -Type4")
str(my_data)
## tibble [770 × 14] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:770] 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : chr [1:770] "Male" "Male" "Male" "Male" ...
## $ cp : chr [1:770] "typical angina" "asymptomatic" "asymptomatic" "non-anginal" ...
## $ trestbps: num [1:770] 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num [1:770] 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : num [1:770] 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : num [1:770] 2 2 2 1 2 1 2 1 2 2 ...
## $ thalch : num [1:770] 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : logi [1:770] FALSE TRUE TRUE FALSE FALSE FALSE ...
## $ oldpeak : num [1:770] 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : num [1:770] 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : num [1:770] 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : chr [1:770] "fixed defect" "normal" "reversable defect" "normal" ...
## $ num : Factor w/ 5 levels "No Heart Disease",..: 1 3 2 1 1 1 4 1 3 2 ...
# Print the first few rows to verify the new labels
head(my_data)
hist(my_data$restecg,
main="Histogram of restecg",
col="lightblue",
xlab="restecg",
border="black")
hist(my_data$fbs,
main="Histogram of fbs",
col="lightblue",
xlab="fbs",
border="black")
hist(my_data$slope,
main="Histogram of slope",
col="lightblue",
xlab="slope",
border="black")
# Density Plot (Smooth Distribution)
plot(density(my_data$age),
main="Density Plot of Age",
col="blue",
lwd=2)
# Density Plot (Smooth Distribution)
plot(density(my_data$trestbps),
main="Density Plot of trestbps",
col="blue",
lwd=2)
# Density Plot (Smooth Distribution)
plot(density(my_data$chol),
main="Density Plot of chol ",
col="blue",
lwd=2)
# Density Plot (Smooth Distribution)
plot(density(my_data$thalch),
main="Density Plot of thalch",
col="blue",
lwd=2)
# Density Plot (Smooth Distribution)
plot(density(my_data$oldpeak),
main="Density Plot of oldpeak",
col="blue",
lwd=2)
# Density Plot (Smooth Distribution)
plot(density(my_data$slope),
main="Density Plot of slope",
col="blue",
lwd=2)
# Density Plot (Smooth Distribution)
plot(density(my_data$restecg),
main="Density Plot of restecg",
col="blue",
lwd=2)
# Boxplot
boxplot(my_data$age,
main="Box Plot of Age",
col="orange")
# Boxplot
boxplot(my_data$trestbps,
main="Box Plot of trestbps ",
col="orange")
# Boxplot
boxplot(my_data$chol,
main="Box Plot of chol",
col="orange")
# Boxplot
boxplot(my_data$restecg,
main="Box Plot of restecg ",
col="orange")
# Boxplot
boxplot(my_data$thalch,
main="Box Plot of thalch",
col="orange")
# Boxplot
boxplot(my_data$oldpeak,
main="Box Plot of oldpeak",
col="orange")
# Count of each num of affected people
table(my_data$num)
##
## No Heart Disease Heart Disease -Type1 Heart Disease -Type2
## 375 210 81
## Heart Disease -Type3 Heart Disease -Type4
## 80 24
# Bar plot for count
barplot(table(my_data$num),
main="Count of num",
col=c("red", "green", "blue"))
# Bar plot for count
barplot(table(my_data$sex),
main="Count of sex",
col=c("red", "green", "blue"))
# Bar plot for count
barplot(table(my_data$cp),
main="Count of cp",
col=c("red", "green", "blue"))
# Bar plot for count
barplot(table(my_data$trestbps),
main="Count of trestbps",
col=c("red", "green", "blue"))
# Bar plot for count
barplot(table(my_data$slope),
main="Count of slope",
col=c("red", "green", "blue"))
# Bar plot for Species count
barplot(table(my_data$thal),
main="Count of thal",
col=c("red", "green", "blue"))
# Scatter plot: age vs trestbps
plot(my_data$age, my_data$trestbps,
main="Scatter Plot of Age vs trestbps",
xlab="age", ylab="trestbps",
col=my_data$num, pch=20)
legend("bottomright", legend=levels(my_data$num), col=1:4, pch=19)
# Multivariate Exploration # Scatter Plot (Numeric vs. Numeric)
# Scatter plot: thalch vs chol
plot(my_data$thalch, my_data$chol,
main="Scatter Plot of thalch vs chol",
xlab="thalch", ylab="chol",
col=my_data$num, pch=20)
legend("topleft", legend=levels(my_data$num), col=1:4, pch=20)
# Scatter plot: thalch vs trestbps
plot(my_data$thalch, my_data$trestbps,
main="Scatter Plot of thalch vs trestbps",
xlab="thalch", ylab="trestbps",
col=my_data$num, pch=20)
legend("bottomleft", legend=levels(my_data$num), col=1:4, pch=20)
# Correlation between age and trestbs
cor(my_data$age, my_data$trestbps)
## [1] 0.2409653
# Correlation between age and trestbs
cor(my_data$age, my_data$chol)
## [1] -0.06950973
# Correlation between chol and trestbs
cor(my_data$chol, my_data$trestbps)
## [1] 0.0599012
# Correlation between thalch and trestbs
cor(my_data$thalch, my_data$trestbps)
## [1] -0.109774
# Correlation between age and thalch
cor(my_data$age, my_data$thalch)
## [1] -0.3739944
# Correlation between age and slope
cor(my_data$age, my_data$slope)
## [1] 0.06146084
# Select only numeric columns
numeric_cols <- sapply(my_data, is.numeric)
correlation_matrix <- cor(my_data[, numeric_cols], use = "complete.obs")
# Print the correlation matrix
print(correlation_matrix)
## age trestbps chol fbs restecg
## age 1.00000000 0.24096532 -0.06950973 0.22647439 0.20749775
## trestbps 0.24096532 1.00000000 0.05990120 0.15692273 0.10034657
## chol -0.06950973 0.05990120 1.00000000 0.02907186 -0.06549605
## fbs 0.22647439 0.15692273 0.02907186 1.00000000 0.12153080
## restecg 0.20749775 0.10034657 -0.06549605 0.12153080 1.00000000
## thalch -0.37399438 -0.10977400 0.19940868 -0.05949849 -0.09568953
## oldpeak 0.26091537 0.17080414 0.05526483 0.05546392 0.10888213
## slope 0.06146084 0.08097304 -0.05522410 0.06742084 0.05004836
## ca 0.24956877 0.02804014 0.15579945 0.07785460 0.03359935
## thalch oldpeak slope ca
## age -0.37399438 0.26091537 0.06146084 0.24956877
## trestbps -0.10977400 0.17080414 0.08097304 0.02804014
## chol 0.19940868 0.05526483 -0.05522410 0.15579945
## fbs -0.05949849 0.05546392 0.06742084 0.07785460
## restecg -0.09568953 0.10888213 0.05004836 0.03359935
## thalch 1.00000000 -0.18268766 -0.31214858 0.03739947
## oldpeak -0.18268766 1.00000000 0.31268535 0.23154038
## slope -0.31214858 0.31268535 1.00000000 -0.10682672
## ca 0.03739947 0.23154038 -0.10682672 1.00000000
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Calculate correlation matrix
numeric_cols <- sapply(my_data, is.numeric)
correlation_matrix <- cor(my_data[, numeric_cols], use = "complete.obs")
# Create heatmap with correlation values
corrplot(correlation_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 0.7,
diag = FALSE)
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(
x = colnames(correlation_matrix),
y = colnames(correlation_matrix),
z = correlation_matrix,
type = "heatmap",
colorscale = "Viridis",
hoverinfo = "x+y+z"
) %>%
layout(
title = "Interactive Correlation Matrix",
xaxis = list(tickangle = 45),
margin = list(l = 100, r = 100, b = 150)
)
# Boxplot: Cholesterol by Heart Disease Status
boxplot(chol ~ num, data = my_data,
main = "Boxplot of Cholesterol by Heart Disease Status",
xlab = "Heart Disease",
ylab = "Cholesterol",
col = c("skyblue", "tomato"))
# Box Plot (Numeric vs. Categorical)
# Boxplot: Thalch by Sex
boxplot(thalch ~ sex, data = my_data,
main = "Boxplot of Max Heart Rate by Sex",
xlab = "Sex",
ylab = "Max Heart Rate (thalch)",
col = c("lightpink", "lightblue"))
# Box Plot (Numeric vs. Categorical)
# Boxplot: Oldpeak by Chest Pain Type
boxplot(oldpeak ~ cp, data = my_data,
main = "Boxplot of ST Depression (Oldpeak) by Chest Pain Type",
xlab = "Chest Pain Type",
ylab = "Oldpeak",
col = rainbow(length(unique(my_data$cp))))
# Box Plot (Numeric vs. Categorical)
# Boxplot: Trestbps by Slope
boxplot(trestbps ~ slope, data = my_data,
main = "Boxplot of Resting BP by Slope of Peak Exercise ST",
xlab = "Slope Type",
ylab = "Resting Blood Pressure",
col = c("orange", "cyan", "lightgreen"))
library(ggplot2)
ggplot(my_data, aes(x = as.factor(num), y = chol, fill = as.factor(num))) +
geom_boxplot() +
ggtitle("Cholesterol Levels by Heart Disease Class") +
xlab("Heart Disease Class (num)") +
ylab("Serum Cholesterol (mg/dl)") +
theme_minimal()
install.packages(“ggplot2”) # Pair Plot (Multivariate Visualization)
pairs(my_data[, c("age", "trestbps", "chol", "thalch", "oldpeak")],
col = my_data$num,
pch = 19,
main = "Pair Plot of Selected Features Colored by Heart Disease")
library(ggplot2)
my_data$num <- as.factor(my_data$num)
ggplot(my_data, aes(x = age, y = thalch, color = num)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
ggtitle("Age vs Max Heart Rate (thalch) by Heart Disease Class") +
xlab("Age") +
ylab("Max Heart Rate Achieved (thalch)")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(my_data, aes(x = trestbps, y = chol, color = num)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
ggtitle("Resting Blood Pressure vs Cholesterol by Heart Disease Class") +
xlab("Resting Blood Pressure (trestbps)") +
ylab("Serum Cholesterol (chol)")
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot with regression line for thalch vs chol
ggplot(my_data, aes(x = thalch, y = chol, color = num)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
ggtitle("Maximum Heart Rate Achieved vs Cholesterol by Heart Disease Class") +
xlab("Maximum Heart Rate Achieved (thalch)") +
ylab("Serum Cholesterol (chol)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Using ggplot2 for Enhanced Visualization
# Load necessary libraries
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
# Select numeric columns for scaling
numeric_cols <- c("age", "trestbps", "chol", "thalch", "oldpeak")
# Apply standardization (z-score normalization)
preProcValues <- preProcess(my_data[, numeric_cols], method = c("center", "scale"))
my_data_scaled <- predict(preProcValues, my_data[, numeric_cols])
# Replace original columns with scaled versions
my_data[, numeric_cols] <- my_data_scaled
# Verify the transformation
summary(my_data[, numeric_cols])
## age trestbps chol thalch
## Min. :-2.6527 Min. :-7.1575 Min. :-2.3638 Min. :-3.04530
## 1st Qu.:-0.7459 1st Qu.:-0.6883 1st Qu.:-0.2244 1st Qu.:-0.72072
## Median : 0.1015 Median :-0.1492 Median : 0.1052 Median : 0.05414
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.7106 3rd Qu.: 0.3899 3rd Qu.: 0.5428 3rd Qu.: 0.79026
## Max. : 2.5380 Max. : 3.6245 Max. : 4.1519 Max. : 2.45621
## oldpeak
## Min. :-1.7406
## 1st Qu.:-0.8184
## Median :-0.3573
## Mean : 0.0000
## 3rd Qu.: 0.5649
## Max. : 4.8994
# Binary encoding for sex (Male=1, Female=0)
my_data$sex <- ifelse(my_data$sex == "Male", 1, 0)
# Ensure fbs is integer (fasting blood sugar > 120 mg/dl)
my_data$fbs <- as.integer(my_data$fbs)
# Convert target variable to factor
my_data$num <- as.factor(my_data$num)
# Identify numeric columns for scaling (excluding already encoded categoricals and target)
numeric_cols <- c("age", "trestbps", "chol", "thalch", "oldpeak", "ca")
# Apply standardization (z-score normalization)
library(caret)
preProcValues <- preProcess(my_data[, numeric_cols], method = c("center", "scale"))
# View the structure of your data
str(my_data)
## tibble [770 × 14] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:770] 1.05 1.48 1.48 -1.7 -1.28 ...
## $ sex : num [1:770] 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : chr [1:770] "typical angina" "asymptomatic" "asymptomatic" "non-anginal" ...
## $ trestbps: num [1:770] 0.659 1.468 -0.688 -0.149 -0.149 ...
## $ chol : num [1:770] 0.154 0.727 0.111 0.338 -0.16 ...
## $ fbs : int [1:770] 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : num [1:770] 2 2 2 1 2 1 2 1 2 2 ...
## $ thalch : num [1:770] 0.442 -1.186 -0.372 1.875 1.294 ...
## $ exang : logi [1:770] FALSE TRUE TRUE FALSE FALSE FALSE ...
## $ oldpeak : num [1:770] 1.303 0.565 1.579 2.409 0.473 ...
## $ slope : num [1:770] 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : num [1:770] 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : chr [1:770] "fixed defect" "normal" "reversable defect" "normal" ...
## $ num : Factor w/ 5 levels "No Heart Disease",..: 1 3 2 1 1 1 4 1 3 2 ...
# Check the first few rows
head(my_data)
# Function to detect outliers using IQR method
detect_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
x < lower_bound | x > upper_bound
}
# Check for outliers in numeric columns
outliers_list <- lapply(my_data[, numeric_cols], detect_outliers)
outliers_count <- sapply(outliers_list, sum)
print(outliers_count)
## age trestbps chol thalch oldpeak ca
## 0 25 99 1 15 123
# Boxplots to visualize outliers
par(mfrow = c(2, 3))
for (col in numeric_cols) {
boxplot(my_data[[col]], main = paste("Boxplot of", col))
}
par(mfrow = c(1, 1))
# 1. Robust Scaling
library(caret)
robust_scaler <- preProcess(my_data[, numeric_cols],
method = c("center", "scale", "YeoJohnson"))
my_data[, numeric_cols] <- predict(robust_scaler, my_data[, numeric_cols])
# 2. Check for zero-IQR columns
zero_iqr_cols <- sapply(my_data[, numeric_cols], function(x) IQR(x, na.rm = TRUE) == 0)
if(any(zero_iqr_cols)) {
message("Columns with IQR=0: ", paste(names(zero_iqr_cols)[zero_iqr_cols], collapse = ", "))
valid_cols <- names(zero_iqr_cols)[!zero_iqr_cols]
}
## Columns with IQR=0: ca
# 3. Multivariate Outlier Detection (with fallback)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
if(length(valid_cols) > 1) { # Need at least 2 columns for covariance
tryCatch({
mcd <- cov.mcd(my_data[, valid_cols])
mahalanobis_dist <- mahalanobis(my_data[, valid_cols],
mcd$center,
mcd$cov)
cutoff <- qchisq(0.99, df = length(valid_cols))
outliers <- mahalanobis_dist > cutoff
# 4. Handle outliers (capping approach)
my_data[outliers, valid_cols] <-
apply(my_data[outliers, valid_cols], 2,
function(x) pmin(pmax(x,
quantile(x, 0.01, na.rm = TRUE)),
quantile(x, 0.99, na.rm = TRUE)))
}, error = function(e) {
message("MCD failed, using regular Mahalanobis: ", e$message)
# Fallback to regular Mahalanobis
mahalanobis_dist <- mahalanobis(my_data[, valid_cols],
colMeans(my_data[, valid_cols]),
cov(my_data[, valid_cols]))
cutoff <- qchisq(0.99, df = length(valid_cols))
outliers <- mahalanobis_dist > cutoff
})
} else {
message("Insufficient valid columns for multivariate outlier detection")
outliers <- rep(FALSE, nrow(my_data))
}
# 5. Final Verification
summary(my_data[, numeric_cols])
## age trestbps chol
## Min. :-2.4825139 Min. :-2.335582 Min. :-2.045079
## 1st Qu.:-0.7616226 1st Qu.:-0.673934 1st Qu.:-0.333804
## Median : 0.0639728 Median :-0.107625 Median : 0.001456
## Mean : 0.0005325 Mean : 0.008164 Mean :-0.001047
## 3rd Qu.: 0.6966808 3rd Qu.: 0.426799 3rd Qu.: 0.498388
## Max. : 2.7240781 Max. : 3.268579 Max. : 5.241707
## thalch oldpeak ca
## Min. :-2.6499355 Min. :-1.997013 Min. :-0.390
## 1st Qu.:-0.7452664 1st Qu.:-0.956260 1st Qu.:-0.390
## Median : 0.0024077 Median :-0.111733 Median :-0.390
## Mean :-0.0007927 Mean : 0.001874 Mean : 0.000
## 3rd Qu.: 0.7790438 3rd Qu.: 0.880230 3rd Qu.:-0.390
## Max. : 2.6990588 Max. : 2.361611 Max. : 4.092
boxplot(my_data[, numeric_cols], main = "Post-Processing Distributions")